Your client is a mental health expert from an NGO who is interested in understanding more about gaming and the potentially addictive effect it can have on some individuals. You are meeting the client in a few days and they would like you to extract and present insights from the Steam dataset to help them in their research.
Please use whichever tools you feel the most comfortable with, but we do recommend Tableau which is a popular choice. Tableau is free for students and there is also a free trial available here.
# Make sure script changes take effect within this session
%load_ext autoreload
%autoreload 2
# import some useful packages for this analysis, start spark session
from setup import *
spark
Open localhost:4040 to monitor the spark UI
# list of data we want to load
dfs = ['Player_Summaries', 'Games_Publishers', 'Games_Genres', 'Games_Developers', 'Games_1', 'Games_2', 'Friends', 'Groups', 'App_ID']
# load raw data and count rows
spark_handler = spark_df_handler()
print('Number of records \n')
for each in dfs:
spark_handler.load(each)
print('{0} : {1}'.format(each, str(spark_handler.dfraw[each].count())))
# Create timestamp vars and select vars of interest
player_summaries = spark_handler.dfraw[dfs[0]]
player_summaries = player_summaries.withColumn('datetimecreated', F.to_timestamp(player_summaries.timecreated,'yyyy-MM-dd HH:mm:ss'))
player = player_summaries.select('steamid', 'primaryclanid', 'datetimecreated')
# make a seperate df with additional info on signup times
player_signup = player.withColumn('year_created', F.date_trunc('year', player_summaries['datetimecreated']))
player_signup = player_signup.withColumn('month_created', F.date_trunc('month', player_signup['datetimecreated']))
player_signup = player_signup.withColumn('day_created', F.date_trunc('day', player_signup['datetimecreated']))
player_signup = player_signup.withColumn('dayofweek_created', F.dayofweek('datetimecreated').cast('byte'))
player_signup = player_signup.withColumn('dayhour_created', F.date_trunc('hour', player_signup['datetimecreated']))
player_signup = player_signup.withColumn('hour_created', F.hour('datetimecreated').cast('byte'))
# concatenate game tables
games = spark_handler.dfraw[dfs[4]] #.union(spark_handler.dfraw[dfs[5]])
# load app table
app_info = spark_handler.dfraw[dfs[8]]
# join games and appinfo
games = games.join(app_info, 'appid', how = 'left')
# calculate player stats using the game table
player_stats = games.groupby('steamid').agg(F.sum('playtime_2weeks').alias('playtime_2weeks_player'),
F.sum('playtime_forever').alias('playtime_forever_player'),
F.countDistinct('appid').alias('game_count_player'),
F.sum('Is_Multiplayer').alias('multiplayer_count_player'),
F.sum('Price').alias('spending_player'))
# fill in zeros for players that didn't play in the two weeks prior, and convert times to hours
player_stats = player_stats.fillna(0)
player_stats = player_stats.withColumn('playtime_average_player', F.col('playtime_2weeks_player') / (14 * 60))
player_stats = player_stats.drop('playtime_2weeks_player')
# calcuate fraction of multiplayer games
player_stats = player_stats.withColumn('multiplayer_fraction_player', F.col('multiplayer_count_player') / F.col('game_count_player'))
# Save player stats
# player_stats_pd = player_stats.toPandas()
# player_stats_pd.to_csv(os.path.join(path, '../player_stats.csv'))
# player_stats_pd = pd.read_csv(os.path.join(path, '../player_stats.csv'), index_col = False)
# load group table
groups = spark_handler.dfraw[dfs[7]]
# count number of groups per player
group_stats = groups.groupby('steamid').agg(F.countDistinct('groupid').alias('group_count_player'))
# load friends table and create some vars
friends = spark_handler.dfraw[dfs[6]]
friends = friends.withColumn('friend_since', F.to_timestamp(friends.friend_since,'yyyy-MM-dd HH:mm:ss'))
friends = friends.withColumn('friend_since_day', F.date_trunc('day', friends['friend_since']))
friends = friends.withColumn('friend_since_month', F.date_trunc('month', friends['friend_since']))
# join friends table with player stats
friends_stats = friends.join(player_stats, friends.steamid_b == player_stats.steamid, how = 'left')
# calculate friends' stats
agg_friends_stats = friends_stats.groupby('steamid_a').agg(F.countDistinct('steamid_b').alias('friend_count'),
F.mean('playtime_average_player').alias('playtime_average_friends'),
F.max('playtime_average_player').alias('playtime_average_friends_max'),
F.mean('playtime_forever_player').alias('playtime_forever_friends'),
F.max('playtime_forever_player').alias('playtime_forever_friends_max'),
F.mean('game_count_player').alias('game_count_friends'),
F.mean('multiplayer_count_player').alias('multiplayer_count_friends'),
F.mean('multiplayer_fraction_player').alias('multiplayer_fraction_friends'),
F.mean('spending_player').alias('spending_friends'))
# Join it all together
user_summary = player_stats.join(player, 'steamid', how = 'left')
user_summary = user_summary.join(group_stats, 'steamid', how = 'left')
user_summary = user_summary.join(agg_friends_stats, user_summary.steamid == agg_friends_stats.steamid_a, how = 'left')
# Save output
user_summary_pd = user_summary.toPandas()
user_summary_pd.to_csv(os.path.join(path, '../user_summary.csv'))
# Read output
user_summary_pd = pd.read_csv(os.path.join(path, '../user_summary.csv'), index_col = False)
# Check sanity
user_summary_pd.describe()
# get aggregates per country
player_location = player_summaries.groupby('loccountrycode').count().toPandas()
# load country borders
world_borders = gpd.read_file(os.path.join(path, '../geo_files/TM_WORLD_BORDERS-0.3.shp'))
# merge borders with agg stats
map_df = world_borders.merge(player_location, how = 'left', left_on = 'ISO2', right_on = 'loccountrycode')
# get density and per capita
map_df['player_density'] = map_df['count'] / map_df.AREA
map_df['player_per_cap'] = map_df['count'] / map_df.POP2005 * 100
# fill missing values
map_df = map_df.replace(np.inf, 0).fillna(0)
# map it out
m = folium.Map(location=[50, 0],
tiles='CartoDB positron',
zoom_start=1.5,
control_scale = True)
colorscale_t = branca.colormap.linear.YlOrRd_09.scale(0, 100000)
colorscale_t.caption = '# of players registered'
folium.GeoJson(map_df, name='players per country',
style_function=lambda x: {"weight": 0.2, 'color': 'black', 'fillColor':colorscale_t(x['properties']['count']), 'fillOpacity':0.9},
highlight_function=lambda x: {'weight':1, 'color':'black'},
smooth_factor=1.0,
show = True,
tooltip=folium.features.GeoJsonTooltip(fields = ['NAME','count', 'player_density', 'player_per_cap'],
aliases=['Country','Number of Players', 'Players / 10 sqkm', 'Players / 100 people' ],
labels=True,
sticky=True,
localize=True
)).add_to(m)
folium.LayerControl().add_to(m)
colorscale_t.add_to(m, 'scale')
m.save(os.path.join(git, '../html_maps/players_per_country.html'))
m